library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(biomaRt) ; library(DESeq2) ; library(sva) ; library(WGCNA) ; library(vsn)
library(dendextend) ; library(expss)
library(knitr) ; library(kableExtra)
Dataset downloaded from Arkinglab website in the Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism section.
# Load csvs
datExpr = read.delim('./../Data/datExpr.csv')
datMeta = read.delim('./../Data/datPheno.csv')
# Create dataset with gene information
datGenes = data.frame('ensembl_gene_id' = substr(datExpr$Gene, 1, 15),
'hgnc_symbol' = substring(datExpr$Gene, 17))
rownames(datExpr) = datGenes$ensembl_gene_id
datExpr$Gene = NULL
### CLEAN METADATA DATA FRAME
datMeta = datMeta %>% dplyr::select('ID', 'case', 'sampleid', 'brainregion', 'positiononplate',
'Gender', 'Age', 'SiteHM', 'RIN', 'PMI', 'Dx') %>%
mutate(brainregion = substr(ID,1,4)) %>%
mutate(Brain_lobe = ifelse(brainregion=='ba19', 'Occipital', 'Frontal'),
Diagnosis = ifelse(Dx=='Autism', 'ASD', 'CTL')) %>%
mutate(Diagnosis = factor(Diagnosis, levels = c('CTL','ASD'))) %>%
dplyr::rename(Subject_ID = sampleid, Sex = Gender)
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# NCBI biotype annotation
NCBI_biotype = read.csv('./../../../NCBI/Data/gene_biotype_info.csv') %>%
dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene,
'hgnc_symbol'=Symbol) %>%
mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))
gg_colour_hue = function(n) {
hues = seq(15, 375, length = n+1)
pal = hcl(h = hues, l = 65, c = 100)[1:n]
}
rm(GO_annotations)
Data description taken from the paper Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism:
Transcriptomes from 104 human brain cortical tissue samples were resolved using next-generation RNA sequencing technology at single-gene resolution and through co-expressing gene clusters or modules. Multiple cortical tissues corresponding to Brodmann Area 19 (BA19), Brodmann Area 10 (BA10) and Brodmann Area 44 (BA44) were sequenced in 62, 14 and 28 samples, respectively, resulting in a total of 57 (40 unique individuals) control and 47 (32 unique individuals) autism samples.
Brain tissue: Frozen brain samples were acquired through the Autism Tissue Program, with samples originating from two different sites: the Harvard Brain Tissue Resource Center and the NICHD Brain and Tissue Bank at the University of Maryland (Gandal’s data were obtained also from the Autism Tissue Program, specifically from the Harvard Brain Bank).
Sequenced using Illumina’s HiSeq 2000 (Gandal used Illumina HiSeq 2500, they are compatible).
The dataset includes 62069 genes from 120 samples belonging to 72 different subjects.
In the paper they talk about an original number of 110 samples and dropping 6 because of low gene coverage, resulting in 104 samples (which are the ones that are included in datMeta), but the expression dataset has 120 samples.
no_metadata_samples = colnames(datExpr)[! colnames(datExpr) %in% datMeta$ID]
no_metadata_subjects = unique(substring(no_metadata_samples, 6))
Samples without Metadata information: ba10.s11, ba10.s12, ba10.s21, ba10.s24, ba10.s87, ba19.s13, ba19.s21, ba19.s54, ba19.s60, ba19.s87, ba44.s12, ba44.s21, ba44.s23, ba44.s24, ba44.s77, ba44.s87
Samples without metadata but with a Subject ID that is included in the Metadata data frame s11, s13, s60, s23
Since we need the Phenotype information of the samples, I’m going to keep the samples that I can map to an existing Subject ID in the metadata and remove the others
add_metadata_subjects = no_metadata_subjects[no_metadata_subjects %in% datMeta$Subject_ID]
add_metadata_samples = no_metadata_samples[grepl(paste(add_metadata_subjects, collapse='|'),
no_metadata_samples)]
for(sample in add_metadata_samples){
new_row = datMeta %>% filter(Subject_ID == strsplit(sample,'\\.')[[1]][2]) %>% dplyr::slice(1) %>%
mutate(ID = sample, brainregion = strsplit(sample,'\\.')[[1]][1],
Brain_lobe = ifelse(strsplit(sample,'\\.')[[1]][1]=='ba19','Occipital','Frontal'))
datMeta = rbind(datMeta, new_row)
}
keep = substring(colnames(datExpr), 6) %in% datMeta$Subject_ID
datExpr = datExpr[,keep]
# Match order of datExpr columns and datMeta rows
datMeta = datMeta[match(colnames(datExpr), datMeta$ID),]
# Check they are in the same order
if(!all(colnames(datExpr) == datMeta$ID)){
cat('\norder of samples don\'t match between datExpr and datMeta!\n')
}
rm(no_metadata_subjects, no_metadata_samples, add_metadata_subjects, add_metadata_samples, sample, new_row)
Removing 12 samples (ba10.s14, ba10.s69, ba10.s72, ba19.s13, ba19.s28, ba19.s62, ba44.s1, ba44.s28, ba44.s44, ba44.s48, NA, NA) belonging to subjects with ID s14, s69, s72, s13, s28, s62, s1, s44, s48, NA
The dataset now has 108 samples belonging to 72 different subjects.
rm(keep)
Counts distribution: More than half of the counts are zero and most of the counts are relatively low, but there are some very high outliers
counts = datExpr %>% melt
count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
'Values' = c(min(counts$value), quantile(counts$value, probs = c(.25, .5)) %>% unname,
mean(counts$value), quantile(counts$value, probs = c(.75)) %>% unname,
max(counts$value)))
count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
| Statistic | Values |
|---|---|
| Min | 0.00 |
| 1st Quartile | 0.00 |
| Median | 0.00 |
| Mean | 250.81 |
| 3rd Quartile | 8.00 |
| Max | 7718873.00 |
rm(counts, count_distr)
Diagnosis distribution by Sample: There are a few more CTL samples than ASD
table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Brain_lobe = 'Brain Lobe',
SiteHM = 'Processing Group', Sex = 'Gender')
cro(table_info$Diagnosis)
| #Total | |
|---|---|
| Diagnosis | |
| CTL | 58 |
| ASD | 50 |
| #Total cases | 108 |
Diagnosis distribution by Subject: There are more control subjects than ASD ones
cro(table_info$Diagnosis[!duplicated(table_info$Subject_ID)])
| #Total | |
|---|---|
| Diagnosis | |
| CTL | 40 |
| ASD | 32 |
| #Total cases | 72 |
Brain region distribution: The Occipital lobe has more samples than the Frontal lobe, even though we are combining two brain regions in the Frontal Lobe
cro(table_info$Brain_lobe)
| #Total | |
|---|---|
| Brain Lobe | |
| Frontal | 44 |
| Occipital | 64 |
| #Total cases | 108 |
Most of the Control samples are from the Occipital lobe, the Autism samples are balanced between both lobes. This could cause problems because Control and Occipital are related
cro(table_info$Diagnosis, list(table_info$Brain_lobe,total()))
| Brain Lobe | #Total | |||
|---|---|---|---|---|
| Frontal | Occipital | |||
| Diagnosis | ||||
| CTL | 19 | 39 | 58 | |
| ASD | 25 | 25 | 50 | |
| #Total cases | 44 | 64 | 108 | |
Gender distribution: There are thrice as many Male samples than Female ones
cro(table_info$Sex)
| #Total | |
|---|---|
| Gender | |
| F | 26 |
| M | 82 |
| #Total cases | 108 |
There is a small imbalance between gender and diagnosis with more males in the control group than in the autism group
cro(table_info$Diagnosis, list(table_info$Sex, total()))
| Gender | #Total | |||
|---|---|---|---|---|
| F | M | |||
| Diagnosis | ||||
| CTL | 12 | 46 | 58 | |
| ASD | 14 | 36 | 50 | |
| #Total cases | 26 | 82 | 108 | |
Age distribution: Subjects between 2 and 82 years old with a mean of 18 years old
summary(datMeta$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 8.75 18.00 20.49 22.00 82.00
datMeta_by_subject = datMeta %>% filter(!duplicated(Subject_ID))
datMeta_by_subject %>% ggplot(aes(Age)) +
geom_density(alpha=0.2, aes(group = Diagnosis, fill = Diagnosis, color = Diagnosis)) +
geom_density(alpha=0.3, fill='gray', color='gray') +
theme_minimal()
rm(datMeta_by_subject)
Processing Group and Diagnosis: There is a relation between processing group and Diagnosis! This can be a problem because processing group is an important Batch Correction covariate, but if it’s related to Diagnosis, we risk inadvertedly losing information related to ASD
cro(table_info$Diagnosis, list(table_info$SiteHM,total()))
| Processing Group | #Total | |||
|---|---|---|---|---|
| H | M | |||
| Diagnosis | ||||
| CTL | 13 | 45 | 58 | |
| ASD | 38 | 12 | 50 | |
| #Total cases | 51 | 57 | 108 | |
rm(table_info)
I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:
Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels (we are not going to retrieve the gene name from biomart because we already extracted it from datExpr)
Annotate genes with Biotype labels:
2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)
2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)
2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys
2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys
Note: A small proportion of genes don’t make a match in any of these queries, so they will be lost when we start filtering out genes
labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
'n_matches' = rep(0,4))
########################################################################################
# 1. Query archive version
getinfo = c('ensembl_gene_id','chromosome_name','start_position','end_position','strand')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'feb2014.archive.ensembl.org')
datGenes_BM = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), values = rownames(datExpr),
mart = mart)
datGenes = datGenes %>% left_join(datGenes_BM, by = 'ensembl_gene_id')
datGenes$length = datGenes$end_position - datGenes$start_position
cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 1580/62069 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels
cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
datGenes = datGenes %>% left_join(NCBI_biotype, by=c('ensembl_gene_id','hgnc_symbol'))
cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 42070/62069 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))
########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'jan2020.archive.ensembl.org')
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart,
values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/',
sum(is.na(datGenes$gene_biotype)),
' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 9621/42070 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]
########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key
missing_genes = unique(datGenes$hgnc_symbol[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes = getinfo, filters = c('hgnc_symbol'), mart = mart,
values = missing_genes)
cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
length(missing_genes),
' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 6295/8036 genes weren't found in the current BioMart version when querying by gene name
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0(' ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
## 18 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by='hgnc_symbol') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes
missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'),
values = missing_ensembl_ids, mart=mart)
cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 1219/7594 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# Plot results
labels_source = labels_source %>% add_row(source = 'missing',
n_matches = nrow(datGenes) - sum(labels_source$n_matches)) %>%
mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),2),
source = factor(source, levels=c('BioMart2014','BioMart2020_byGene','BioMart2020_byID',
'NCBI','missing')))
p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
axis.text.y=element_blank(), axis.ticks.y=element_blank()) + ylab('Percentage of genes') +
scale_fill_manual(values = c(gg_colour_hue(nrow(labels_source)-1),'gray'))
ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))
########################################################################################
# Reorder rows to match datExpr
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
rm(getinfo, mart, datGenes_BM, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
dups, missing_ensembl_ids, missing_genes, labels_source, p)
Checking how many SFARI genes are in the dataset
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
n_SFARI = df[['gene-symbol']] %>% unique %>% length
Considering all genes, this dataset contains 910 of the 912 SFARI genes
1.- Filter entries for which we didn’t manage to obtain its genotype information
1.1 Missing Biotype
to_keep = !is.na(datGenes$gene_biotype)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id
Removed 1219 ‘genes’, 60850 remaining
Filtering genes without biotype information, we are left with 910 SFARI Genes (we lose 0 genes)
1.2 Missing Length of the sequence
to_keep = !is.na(datGenes$length)
datExpr = datExpr[to_keep,]
datGenes = datGenes[to_keep,]
Removed 361 ‘genes’, 60489 remaining
Filtering genes without sequence length information, we are left with 910 SFARI Genes (we lose 0 genes)
2.- Filter genes that do not encode any protein
37% of the genes are protein coding genes
datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
kable_styling(full_width = F)
| . | Freq |
|---|---|
| protein_coding | 22113 |
| lncRNA | 11392 |
| processed_pseudogene | 9380 |
| unprocessed_pseudogene | 2461 |
| miRNA | 2212 |
| misc_RNA | 2156 |
| snRNA | 2017 |
| 1 | 1929 |
| pseudogene | 1353 |
| snoRNA | 1178 |
| lincRNA | 661 |
| transcribed_unprocessed_pseudogene | 654 |
| rRNA_pseudogene | 485 |
| transcribed_processed_pseudogene | 418 |
| antisense | 315 |
| 6 | 304 |
| 3 | 301 |
| IG_V_pseudogene | 168 |
| TR_V_gene | 146 |
| IG_V_gene | 134 |
| transcribed_unitary_pseudogene | 90 |
| TR_J_gene | 81 |
| unitary_pseudogene | 74 |
| processed_transcript | 59 |
| rRNA | 58 |
| TR_V_pseudogene | 46 |
| sense_intronic | 43 |
| sense_overlapping | 38 |
| scaRNA | 31 |
| polymorphic_pseudogene | 28 |
| IG_D_gene | 27 |
| Mt_tRNA | 22 |
| IG_J_gene | 18 |
| 4 | 17 |
| 7 | 16 |
| IG_C_gene | 14 |
| TEC | 9 |
| IG_C_pseudogene | 8 |
| TR_C_gene | 8 |
| ribozyme | 5 |
| TR_J_pseudogene | 5 |
| 3prime_overlapping_ncrna | 4 |
| IG_J_pseudogene | 3 |
| TR_D_gene | 3 |
| Mt_rRNA | 2 |
| 8 | 1 |
| translated_processed_pseudogene | 1 |
| translated_unprocessed_pseudogene | 1 |
Most of the non-protein coding genes have very low levels of expression
plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = apply(datExpr, 1, mean),
'ProteinCoding' = datGenes$gene_biotype=='protein_coding')
ggplotly(plot_data %>% ggplot(aes(log2(MeanExpr+1), fill=ProteinCoding, color=ProteinCoding)) +
geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
Filtering protein coding genes, we are left with 906 SFARI Genes (we lose 4 genes)
Note: The gene name for Ensembl ID ENSG00000187951 is wrong, it should be AC091057.1 instead of ARHGAP11B, but the biotype is right, so it would still be filtered out
n_SFARI = df[['gene-symbol']][df$gene_biotype=='protein_coding'] %>% unique %>% length
df %>% filter(!`gene-symbol` %in% df$`gene-symbol`[df$gene_biotype=='protein_coding']) %>%
dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>%
kable(caption='Lost Genes') %>% kable_styling(full_width = F)
| ID | gene-symbol | gene-score | gene_biotype | syndromic | number-of-reports |
|---|---|---|---|---|---|
| ENSG00000187951 | ARHGAP11B | 3 | lncRNA | 0 | 2 |
| ENSG00000251593 | MSNP1AS | 2 | processed_pseudogene | 0 | 12 |
| ENSG00000233067 | PTCHD1-AS | 2 | lncRNA | 0 | 3 |
| ENSG00000197558 | SSPO | 3 | transcribed_unitary_pseudogene | 0 | 4 |
rm(df)
if(!all(rownames(datExpr)==rownames(datGenes))) cat('!!! gene rownames do not match!!!')
to_keep = datGenes$gene_biotype == 'protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id
Removed 38376 genes. 22113 remaining
3.- Filter genes with low expression levels
\(\qquad\) 3.1 Remove genes with zero expression in all of the samples
to_keep = rowSums(datExpr) > 0
df = data.frame('rowSums' = rowSums(datExpr), 'ensembl_gene_id' = rownames(datExpr)) %>%
right_join(SFARI_genes, by='ensembl_gene_id') %>% filter(rowSums == 0 & !is.na(`gene-score`)) %>%
arrange(`gene-score`) %>% dplyr::select(-ensembl_gene_id) %>%
filter(!duplicated(`gene-symbol`), !`gene-symbol` %in% datGenes$hgnc_symbol[to_keep])
datExpr = datExpr[to_keep,]
datGenes = datGenes[to_keep,]
Removed 2803 genes. 19310 remaining
904 SFARI genes remaining (we lost 2 genes)
df %>% dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>%
kable(caption='Lost SFARI Genes') %>% kable_styling(full_width = F)
| ID | gene-symbol | gene-score | gene_biotype | syndromic | number-of-reports |
|---|---|---|---|---|---|
| ENSG00000267910 | KMT2A | 1 | protein_coding | 0 | 20 |
| ENSG00000235718 | MFRP | 2 | protein_coding | 0 | 6 |
| ENSG00000167014 | TERB2 | 3 | protein_coding | 0 | 1 |
n_SFARI = SFARI_genes[['gene-symbol']][SFARI_genes$ID %in% rownames(datExpr)] %>% unique %>% length
rm(df)
\(\qquad\) 3.2 Removing genes with a high percentage of zeros
Choosing the threshold:
Criteria for selecting the percentage of zeros threshold: The minimum value in which the preprocessed data is relatively homoscedastic (we’re trying to get rid of the group of genes with very low mean and SD that make the cloud of points look like a comic book speech bubble)
On the plot I’m using the “dual” of the maximum percentage of zeros, the minimum percentage of non zeros so the visualisation is more intuitive
75% seems to be a good threshold for the minimum percentage of non zeros, so 25% will be the maximum percentage of zeros allowed in a row
The Mean vs SD plot doesn’t show all of the genes, a random sample was selected for the genes with higher level of expression so the visualisation wouldn’t be as heavy (and since we care about the genes with the lowest levels of expression, we aren’t losing important information)
datMeta_original = datMeta
datExpr_original = datExpr
datGenes_original = datGenes
thresholds = round(100*(ncol(datExpr)-c(seq(100,7,-10),5,3,2,0))/ncol(datExpr))
for(threshold in thresholds){
datMeta = datMeta_original
datExpr = datExpr_original
datGenes = datGenes_original
to_keep = apply(datExpr, 1, function(x) 100*mean(x>0)) >= threshold
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
# Filter outlier samples
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
rm(absadj, netsummary, ku, z.ku, to_keep)
# Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
IRanges(start=datGenes$start_position, width=datGenes$length),
strand=datGenes$strand,
feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis)
# Perform vst
vsd = vst(dds)
datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)
rm(counts, rowRanges, se, vsd)
# Save summary results in dataframe
if(threshold == thresholds[1]){
mean_vs_sd_data = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
} else {
new_entries = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
}
}
# Visualise the effects of different thresholds
to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<7] %>%
as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=7]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/10)) %>% as.character
plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]
ggplotly(plot_data %>% ggplot(aes(Mean, SD)) +
geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) +
scale_x_log10() + scale_y_log10() + theme_minimal())